library(dplyr)
library(tidyr)
library(stringr)
library(readr) # read csv
library(ggpubr) # combine multiple graphs into one
library(ggplot2)
library(lubridate)
library(Polychrome) # color palette
library(knitr)
library(kableExtra)
library(gridExtra)
# Set the working directory to the project root so that the relative
# ./data paths below resolve.
# setwd() returns the *previous* working directory, so it must be called
# exactly once: wrapping it in a second setwd() would switch straight back
# to the directory the session started in.
setwd("~/Documents/GitHub/code-ownership-analysis")

# Load the commit history of every analysed repository (one CSV per repo).
tensorflow <- read_csv("./data/tensorflow_commit_history.csv")
keras <- read_csv("./data/keras_commit_history.csv")
pytorch <- read_csv("./data/pytorch_commit_history.csv")
vscode <- read_csv("./data/vscode_commit_history.csv")
powertoys <- read_csv("./data/PowerToys_commit_history.csv")
react <- read_csv("./data/react_commit_history.csv")
reactNative <- read_csv("./data/react-native_commit_history.csv")
createReactApp <- read_csv("./data/create-react-app_commit_history.csv")
core <- read_csv("./data/core_commit_history.csv")
flutter <- read_csv("./data/flutter_commit_history.csv")
azureDocs <- read_csv("./data/azure-docs_commit_history.csv")
stableDiffusionWebUI <- read_csv("./data/stable-diffusion-webui_commit_history.csv")
nextJs <- read_csv("./data/next.js_commit_history.csv")
langchain <- read_csv("./data/langchain_commit_history.csv")
When reading the commit data from these files, R will throw warning messages because of missing values. The following code chunk inspects what values are missing.
Combine the data of all repositories into a single data frame. Here I
also create a new column called repo to indicate from which
repository the data is from.
# Stack the commit histories of all repositories into one data frame.
# Each input is tagged with a `repo` column first so that every row can
# still be traced back to the repository it came from.
all_repos <- bind_rows(
  mutate(tensorflow, repo = "tensorflow"),
  mutate(keras, repo = "keras"),
  mutate(pytorch, repo = "pytorch"),
  mutate(vscode, repo = "vscode"),
  mutate(powertoys, repo = "powertoys"),
  mutate(react, repo = "react"),
  mutate(reactNative, repo = "reactNative"),
  mutate(createReactApp, repo = "createReactApp"),
  mutate(core, repo = "core"),
  mutate(flutter, repo = "flutter"),
  mutate(azureDocs, repo = "azureDocs"),
  mutate(stableDiffusionWebUI, repo = "stableDiffusionWebUI"),
  mutate(nextJs, repo = "nextJs"),
  mutate(langchain, repo = "langchain")
)
Extract yaml commits inside the .github/workflows
directory. The input data frame needs to be structured so that each row
represents a yaml file from a commit. If a commit features multiple yaml
files, it is split into multiple rows.
# Helper: keep only the rows of a commit data frame that touch a GitHub
# Actions workflow file, and shorten the file names for plotting.
#
# commits_df: data frame with a character column `file` holding the
#             repository-relative path of the changed file (one row per
#             file of a commit).
# Returns `commits_df` restricted to files located under
# .github/workflows/ whose name ends in .yml or .yaml, with that directory
# prefix and the extension removed from `file`. Rows with a missing `file`
# are dropped by filter().
extract_yaml_files <- function(commits_df) {
  # Match the directory literally; as a regex the dots would match any
  # character.
  workflow_dir <- fixed(".github/workflows/")
  # Escaped and anchored so that only a real trailing extension matches
  # (an unanchored ".yml|.yaml" would also eat "-yml" or "_yaml" inside a
  # name). Accepts both spellings of the extension.
  yaml_extension <- "\\.ya?ml$"

  commits_df %>%
    filter(
      str_starts(file, workflow_dir),
      str_detect(file, yaml_extension)
    ) %>%
    mutate(
      # the filter guarantees the prefix is at the start of the string
      file = str_remove(file, workflow_dir),
      file = str_remove(file, yaml_extension)
    )
}
# Apply extract_yaml_files() to every repository, producing one
# <repo>_yml_commits data frame per repository.
# TensorFlow
tensorflow_yml_commits <- extract_yaml_files(tensorflow)
# Keras
keras_yml_commits <- extract_yaml_files(keras)
# Pytorch
pytorch_yml_commits <- extract_yaml_files(pytorch)
# VScode
vscode_yml_commits <- extract_yaml_files(vscode)
# PowerToys
powertoys_yml_commits <- extract_yaml_files(powertoys)
# React
react_yml_commits <- extract_yaml_files(react)
# React-Native
reactNative_yml_commits <- extract_yaml_files(reactNative)
# Create-React-App
createReactApp_yml_commits <- extract_yaml_files(createReactApp)
# Core
core_yml_commits <- extract_yaml_files(core)
# Flutter
flutter_yml_commits <- extract_yaml_files(flutter)
# Azure-Docs
azureDocs_yml_commits <- extract_yaml_files(azureDocs)
# stableDiffusionWebUI
stableDiffusionWebUI_yml_commits <- extract_yaml_files(stableDiffusionWebUI)
# nextJs
nextJs_yml_commits <- extract_yaml_files(nextJs)
# Langchain
langchain_yml_commits <- extract_yaml_files(langchain)
Combine all yml files into a single data frame
# Stack the workflow-file commits of all repositories into one data frame,
# tagging each input with a `repo` column that names its repository.
all_yml_files <- bind_rows(
  mutate(tensorflow_yml_commits, repo = "tensorflow"),
  mutate(keras_yml_commits, repo = "keras"),
  mutate(pytorch_yml_commits, repo = "pytorch"),
  mutate(vscode_yml_commits, repo = "vscode"),
  mutate(powertoys_yml_commits, repo = "powertoys"),
  mutate(react_yml_commits, repo = "react"),
  mutate(reactNative_yml_commits, repo = "reactNative"),
  mutate(createReactApp_yml_commits, repo = "createReactApp"),
  mutate(core_yml_commits, repo = "core"),
  mutate(flutter_yml_commits, repo = "flutter"),
  mutate(azureDocs_yml_commits, repo = "azureDocs"),
  mutate(stableDiffusionWebUI_yml_commits, repo = "stableDiffusionWebUI"),
  mutate(nextJs_yml_commits, repo = "nextJs"),
  mutate(langchain_yml_commits, repo = "langchain")
)
This plot shows the number of authors and commits per file.
# Stacked bar chart of the number of commits per YAML file, split by author.
#
# df:               data frame with one row per committed file; the columns
#                   `file` and `author` are used.
# repoName:         display name of the repository, used in the title.
# thresholdFiles:   maximum number of files shown; when df contains more,
#                   only the files with the most commits are plotted.
# thresholdAuthors: the legend is hidden when df contains more distinct
#                   authors than this.
# Returns a ggplot object.
plot_commit_contribution <- function(df, repoName, thresholdFiles = 10,
                                     thresholdAuthors = 20) {
  # Totals over the whole input (not only over the plotted files)
  ymlFilesCount <- n_distinct(df$file)
  ymlAuthorCount <- n_distinct(df$author)

  # Files with the most commits. slice_head() returns exactly
  # `thresholdFiles` rows even when there are ties, and simply returns all
  # rows when fewer files exist.
  top_files <- df %>%
    count(file, name = "count") %>%
    arrange(desc(count)) %>%
    slice_head(n = thresholdFiles)

  # The caption depends on whether files had to be left out. paste() already
  # separates its arguments with a space, so the literals carry no padding.
  caption_text <- if (ymlFilesCount > thresholdFiles) {
    paste(
      "Among", ymlFilesCount, "total YAML files, this graph only includes",
      thresholdFiles, "files with the most commits."
    )
  } else {
    paste(
      "Total of", ymlFilesCount, "YAML files from", ymlAuthorCount,
      "unique authors."
    )
  }

  # A legend with too many authors would dominate the plot
  legend_position <- if (ymlAuthorCount > thresholdAuthors) "none" else "bottom"

  df %>%
    filter(file %in% top_files$file) %>%
    # commits per (file, author); count() returns an ungrouped result
    count(file, author, name = "count") %>%
    ggplot(aes(
      fill = author,
      # order the files by their total number of commits
      x = reorder(file, count, sum),
      y = count,
      label = count
    )) +
    geom_col() +
    geom_text(size = 3, position = position_stack(vjust = 0.5)) +
    labs(
      x = "Yaml Files",
      y = "Number of Commits",
      title = paste("Commit Contribution for YAML Files in", repoName),
      caption = caption_text
    ) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 20, vjust = 1, hjust = 1),
      plot.caption = element_text(hjust = 0),
      legend.position = legend_position
    )
}
# test function with react repo
#plot_commit_contribution(react_yml_commits, "React")
This plot shows the number of commits over the time from the first (yaml file) commit in the workflow directory up to the date where the data was fetched.
# Bar chart of the number of YAML-file commits per calendar day.
#
# df:       data frame with one row per committed file; the columns `date`
#           and `hash` are used.
# repoName: display name of the repository, used in the title.
# Returns a ggplot object. Stops with an error when df has no row with a
# non-missing date, because there is nothing to put on the time axis.
plot_commit_timeline <- function(df, repoName) {
  # Rows without a date cannot be placed on the time axis and would turn the
  # years in the title and the dates in the caption into NA.
  dated <- df %>%
    filter(!is.na(date)) %>%
    # as.Date() strips the time of day so that commits are binned per day
    mutate(day = as.Date(date))

  if (nrow(dated) == 0) {
    stop("No commits with a valid date to plot for ", repoName, call. = FALSE)
  }

  # Values shown in the title and caption
  yearFirstCommit <- min(year(dated$date))
  yearLastCommit <- max(year(dated$date))
  dateFirstCommit <- format(min(dated$day), "%B %d, %Y")
  dateLastCommit <- format(max(dated$day), "%B %d, %Y")
  totalCommits <- n_distinct(dated$hash)

  # One row (and therefore one bar) per day, matching the caption
  daily_commits <- dated %>%
    count(day, name = "count")
  maxDailyCommit <- max(daily_commits$count)

  # At most `maxTicks` breaks on the y-axis; ceiling() keeps the interval a
  # whole number of commits (2.7 -> 3)
  maxTicks <- 10
  tickInterval <- ceiling(maxDailyCommit / maxTicks)

  daily_commits %>%
    ggplot(aes(x = day, y = count)) +
    geom_col(color = "#00AFBB", fill = "#00AFBB") +
    scale_y_continuous(
      breaks = seq(0, maxDailyCommit, by = tickInterval),
      limits = c(0, maxDailyCommit)
    ) +
    labs(
      x = "Date",
      y = "Number of Commits",
      title = paste0(
        "YAML File Commit Activity Over Time in the ", repoName,
        " Repository (", yearFirstCommit, "-", yearLastCommit, ")"
      ),
      caption = paste0(
        "The data represents a total of ", totalCommits,
        " commits recorded from ", dateFirstCommit, " to ", dateLastCommit,
        ". \nNote: Individual bars represent the total commits per day; gaps indicate days with no recorded activity."
      )
    ) +
    theme_bw() +
    theme(plot.caption = element_text(hjust = 0))
}
# test function with react repo
#plot_commit_timeline(react_yml_commits, "React")
This plot shows an ownership map where you can see who committed when to a file and who was the owner of it at what point in time.
# Helper: for every row of `commits` (columns `file`, `date`, `author`)
# determine who led that row's file at that row's date. The leader is the
# author with the most commits to the file up to and including that date;
# ties are broken in favour of the author whose latest commit is most recent.
# Returns a character vector with one entry per row of `commits` (NA when no
# commit with a comparable date exists, e.g. for a missing date).
.leading_author_per_commit <- function(commits) {
  leading <- rep(NA_character_, nrow(commits))
  # Work file by file so each lookup only scans that file's own history
  rows_by_file <- split(seq_len(nrow(commits)), commits$file)

  for (file_rows in rows_by_file) {
    file_history <- commits[file_rows, ]
    for (k in seq_along(file_rows)) {
      # which() ignores comparisons against missing dates instead of
      # turning them into all-NA rows
      up_to_now <- file_history[
        which(file_history$date <= file_history$date[k]),
      ]
      if (nrow(up_to_now) == 0) {
        next
      }
      leader <- up_to_now %>%
        group_by(author) %>%
        summarise(commit_count = n(), latest_commit = max(date)) %>%
        arrange(desc(commit_count), desc(latest_commit)) %>%
        slice(1) %>%
        pull(author)
      leading[file_rows[k]] <- as.character(leader)
    }
  }
  leading
}

# Ownership map: one horizontal line per file over time, coloured by the
# file's leading author (owner) at that time, with a point for every commit
# coloured by the commit's author.
#
# df:             data frame with one row per committed file; the columns
#                 `file`, `date` and `author` are used.
# repoName:       display name of the repository, used in the title.
# thresholdFiles: maximum number of files shown; when df contains more, only
#                 the files with the most commits are plotted.
# Returns a ggplot object.
plot_ownership_map <- function(df, repoName, thresholdFiles = 40) {
  # Values shown in the title and caption
  yearFirstCommit <- min(year(df$date), na.rm = TRUE)
  yearLastCommit <- max(year(df$date), na.rm = TRUE)
  ymlFilesCount <- n_distinct(df$file)

  # Files with the most commits. slice_head() returns exactly
  # `thresholdFiles` rows even when there are ties, and simply returns all
  # rows when fewer files exist.
  top_files <- df %>%
    count(file, name = "count") %>%
    arrange(desc(count)) %>%
    slice_head(n = thresholdFiles)

  # Commits of the plotted files. The row order (file, author, date) also
  # fixes the order in which authors are assigned their colours below.
  commits <- df %>%
    filter(file %in% top_files$file) %>%
    arrange(file, author, date)

  # Built in one step; growing a data frame row by row is quadratic
  result_df <- data.frame(
    file = commits$file,
    date = commits$date,
    author = commits$author,
    leadingAuthor = .leading_author_per_commit(commits),
    stringsAsFactors = FALSE
  )

  # Assign a unique colour to each author using a custom colour palette
  authors <- unique(result_df$author)
  seedcolors <- c(
    "#E63946", "#F4A261", "#2A9D8F", "#D90429", "#E76F51",
    "#2A6F97", "#F77F00", "#80B918", "#9A031E", "#D62828",
    "#023E7D", "#5F0F40", "#9E0059", "#F8961E", "#3A0CA3",
    "#E9C46A", "#8D99AE", "#006D77", "#EF476F", "#F72585"
  )
  colors <- createPalette(length(authors), seedcolors = seedcolors)
  names(colors) <- authors

  # The caption depends on whether files had to be left out
  caption_text <- if (ymlFilesCount > thresholdFiles) {
    paste0(
      "From a total of ", ymlFilesCount,
      " YAML files, this graph only includes ", thresholdFiles,
      " files with the most commits. \n Line colors denote the current leading contributor (owner) for each file."
    )
  } else {
    paste0(
      "Total of ", ymlFilesCount,
      " YAML files. Line colors denote the current leading contributor (owner) for each file."
    )
  }

  result_df %>%
    ggplot(aes(x = date, y = file, group = file)) +
    # Lines change colour with the leading author
    geom_line(aes(color = leadingAuthor), linewidth = 1, alpha = 1) +
    # One point per commit
    geom_point(aes(color = author), size = 3, alpha = 0.6) +
    scale_color_manual(values = colors) +
    # Shorten all file names longer than 10 characters on the y-axis
    scale_y_discrete(labels = function(x) {
      ifelse(nchar(x) > 10, paste0(substr(x, 1, 10), "..."), x)
    }) +
    labs(
      x = "Date",
      y = "Files",
      color = "Authors",
      title = paste0(
        "Ownership Map of YAML Files from ", repoName, " Repository (",
        yearFirstCommit, "-", yearLastCommit, ")"
      ),
      caption = caption_text
    ) +
    theme_bw() +
    theme(
      # hide the legend for spacing reasons
      legend.position = "none",
      # shift the caption to the left
      plot.caption = element_text(hjust = -0.1, vjust = 1)
    )
}
# test function with react repo
#plot_ownership_map(react_yml_commits, "React")
Create similar plot as the ownership map but only for a single file and display additions and deletions for each commit. This function takes a df from the hole repo together with its name, as well as the name of the file of interest as input.
# Bar chart of the net line change (additions - deletions) of every commit
# to a single file, in chronological order. Bars are filled green for a
# positive and red for a non-positive net change; the bar border is coloured
# by author.
#
# df:       data frame with one row per committed file; the columns `file`,
#           `author`, `date`, `hash`, `additions` and `deletions` are used.
# repoName: display name of the repository, used in the title.
# fileName: value of `file` to plot (compared with ==).
# Returns a ggplot object. Warns when no row of df matches `fileName`.
plot_file_evolution <- function(df, repoName, fileName) {
  # Keep only the commits of the requested file
  filtered_df <- df %>% filter(file == fileName)
  if (nrow(filtered_df) == 0) {
    warning(
      "No commits found for file \"", fileName, "\" in ", repoName,
      call. = FALSE
    )
  }

  # Values shown in the caption
  authorCount <- n_distinct(filtered_df$author)
  commitCount <- nrow(filtered_df)

  # Assign a unique colour to each author
  authors <- unique(filtered_df$author)
  author_colors <- setNames(rainbow(length(authors)), authors)

  plot_data <- filtered_df %>%
    mutate(net_change = as.numeric(additions) - as.numeric(deletions)) %>%
    # `hash` makes the order of commits with identical dates deterministic
    arrange(date, hash) %>%
    mutate(commit_order = row_number())

  plot_data %>%
    ggplot(aes(
      x = commit_order,
      y = net_change,
      fill = ifelse(net_change > 0, "Positive", "Negative")
    )) +
    geom_col(width = 1) +
    scale_fill_manual(
      values = c("Positive" = "darkgreen", "Negative" = "darkred")
    ) +
    # the commit sequence has no meaningful tick values
    scale_x_continuous(name = "Commit Sequence", breaks = NULL) +
    scale_y_continuous(
      name = "Lines Changed",
      breaks = scales::pretty_breaks(n = 10),
      labels = scales::comma
    ) +
    # Second bar layer drawn on top to add the author-coloured border
    geom_col(aes(color = factor(author)), show.legend = FALSE) +
    scale_color_manual(values = author_colors) +
    # zero line
    geom_hline(yintercept = 0) +
    labs(
      title = paste0(
        "Commit Contributions for file \"", fileName, "\" in ", repoName
      ),
      # paste() already separates its arguments with a space
      caption = paste(
        "Total of", commitCount, "commits from", authorCount,
        "unique authors."
      )
    ) +
    theme_bw() +
    theme(
      axis.ticks.x = element_blank(),
      plot.caption = element_text(hjust = 0),
      legend.position = "none"
    )
}
# test function with react repo
#plot_file_evolution(react_yml_commits, "React", "commit_artifacts")
# Summarize all repository data: distinct commits and authors plus the date
# range of the commits, per repository
repo_summary <- all_repos %>%
  # remove commits without commit date
  filter(!is.na(date)) %>%
  group_by(repo) %>%
  summarize(
    "commits" = n_distinct(hash),
    "authors" = n_distinct(author),
    "first commit" = min(as.Date(date)),
    "last commit" = max(as.Date(date))
  )

# Summarize YAML specific data (yml_commits counts rows, i.e. one per
# workflow file touched by a commit)
yaml_summary <- all_yml_files %>%
  # remove commits without commit date
  filter(!is.na(date)) %>%
  group_by(repo) %>%
  summarize(
    yml_commits = n(),
    yml_authors = n_distinct(author),
    "first yml commit" = min(as.Date(date))
  )

# Combine summaries, largest repository first
final_table <- left_join(repo_summary, yaml_summary, by = "repo") %>%
  select(
    repo, "commits", "authors", yml_commits, yml_authors,
    "first commit", "last commit", "first yml commit"
  ) %>%
  arrange(desc(commits))

# Row with the column sums. All values are formatted as text; the date
# columns are intentionally left blank.
# NOTE(review): `authors` and `yml_authors` are sums of the per-repository
# counts, so a person contributing to several repositories is counted once
# per repository. An earlier comment here asked for the number of unique
# authors instead - confirm which figure is wanted.
totals <- final_table %>%
  summarize(
    repo = "Total",
    "commits" = sum(commits, na.rm = TRUE) %>%
      format(big.mark = "'", scientific = FALSE),
    "authors" = sum(authors, na.rm = TRUE) %>%
      format(big.mark = "'", scientific = FALSE),
    yml_commits = sum(yml_commits, na.rm = TRUE) %>%
      format(big.mark = "'", scientific = FALSE),
    yml_authors = sum(yml_authors, na.rm = TRUE) %>%
      format(big.mark = "'", scientific = FALSE),
    "first commit" = "",
    "last commit" = "",
    "first yml commit" = ""
  )

# Append the totals row. The per-repository values are converted to text
# first: binding the text totals onto Date/integer columns would coerce the
# blank date cells of the totals row to NA.
final_table_with_totals <- bind_rows(
  final_table %>% mutate(across(everything(), as.character)),
  totals
)

# Create the table
kable(final_table_with_totals, align = "c", caption = "Repository Summary")
| repo | commits | authors | yml_commits | yml_authors | first commit | last commit | first yml commit |
|---|---|---|---|---|---|---|---|
| azureDocs | 779876 | 17351 | 89 | 18 | 1970-01-01 | 2023-12-15 | 2019-09-11 |
| core | 353152 | 7139 | 1018 | 44 | 2013-09-17 | 2023-12-15 | 2020-02-25 |
| pytorch | 342885 | 6149 | 40182 | 303 | 2012-01-25 | 2023-12-12 | 2019-09-06 |
| flutter | 255656 | 3114 | 435 | 29 | 2014-10-23 | 2023-12-15 | 2019-09-05 |
| tensorflow | 197339 | 6218 | 334 | 46 | 2015-11-06 | 2023-12-12 | 2019-08-28 |
| vscode | 161264 | 4197 | 1709 | 99 | 2015-11-13 | 2023-12-12 | 2019-09-21 |
| reactNative | 78949 | 5173 | 221 | 32 | 2015-01-29 | 2023-12-12 | 2019-08-22 |
| nextJs | 78900 | 5467 | 2430 | 77 | 2016-10-05 | 2023-12-15 | 2019-08-09 |
| react | 45913 | 3915 | 133 | 30 | 2013-05-28 | 2023-12-12 | 2019-11-30 |
| langchain | 39519 | 3403 | 976 | 48 | 2022-10-16 | 2023-12-15 | 2022-10-20 |
| powertoys | 27167 | 749 | 141 | 21 | 2013-12-19 | 2023-12-12 | 2020-01-27 |
| keras | 24293 | 2321 | 81 | 17 | 2015-03-21 | 2023-12-12 | 2020-02-11 |
| createReactApp | 16141 | 2854 | 185 | 38 | 2016-07-15 | 2023-12-03 | 2019-10-20 |
| stableDiffusionWebUI | 11157 | 1117 | 26 | 11 | 2022-08-22 | 2023-12-15 | 2022-08-24 |
| Total | 2’412’211 | 69’167 | 47’960 | 813 | NA | NA | NA |
# For every repository, render three plots: the commit timeline, the
# ownership map and the evolution of one selected workflow file.
# TensorFlow
plot_commit_timeline(tensorflow_yml_commits, "TensorFlow")
plot_ownership_map(tensorflow_yml_commits, "TensorFlow")
plot_file_evolution(tensorflow_yml_commits, "TensorFlow", "arm-ci")
# Keras
plot_commit_timeline(keras_yml_commits, "Keras")
plot_ownership_map(keras_yml_commits, "Keras")
plot_file_evolution(keras_yml_commits, "Keras", "actions")
# Pytorch
plot_commit_timeline(pytorch_yml_commits, "Pytorch")
plot_ownership_map(pytorch_yml_commits, "Pytorch")
plot_file_evolution(pytorch_yml_commits, "Pytorch", "_win-test")
# VS Code
plot_commit_timeline(vscode_yml_commits, "VS Code")
plot_ownership_map(vscode_yml_commits, "VS Code")
plot_file_evolution(vscode_yml_commits, "VS Code", "ci")
# PowerToys
plot_commit_timeline(powertoys_yml_commits, "Powertoys")
plot_ownership_map(powertoys_yml_commits, "Powertoys")
plot_file_evolution(powertoys_yml_commits, "Powertoys", "package-submissions")
# React
plot_commit_timeline(react_yml_commits, "React")
plot_ownership_map(react_yml_commits, "React")
plot_file_evolution(react_yml_commits, "React", "devtools_check_repro")
# React Native
plot_commit_timeline(reactNative_yml_commits, "React Native")
plot_ownership_map(reactNative_yml_commits, "React Native")
plot_file_evolution(reactNative_yml_commits, "React Native", "on-issue-labeled")
# Create-React-App
plot_commit_timeline(createReactApp_yml_commits, "Create-React-App")
plot_ownership_map(createReactApp_yml_commits, "Create-React-App")
plot_file_evolution(createReactApp_yml_commits, "Create-React-App", "build")
# Core
plot_commit_timeline(core_yml_commits, "Core")
plot_ownership_map(core_yml_commits, "Core")
plot_file_evolution(core_yml_commits, "Core", "builder")
# Flutter
plot_commit_timeline(flutter_yml_commits, "Flutter")
plot_ownership_map(flutter_yml_commits, "Flutter")
plot_file_evolution(flutter_yml_commits, "Flutter", "scorecards-analysis")
# Azure Docs
plot_commit_timeline(azureDocs_yml_commits, "Azure Docs")
plot_ownership_map(azureDocs_yml_commits, "Azure Docs")
plot_file_evolution(azureDocs_yml_commits, "Azure Docs", "stale")
# StableDiffusionWebUI
plot_commit_timeline(stableDiffusionWebUI_yml_commits, "StableDiffusionWebUI")
plot_ownership_map(stableDiffusionWebUI_yml_commits, "StableDiffusionWebUI")
plot_file_evolution(stableDiffusionWebUI_yml_commits, "StableDiffusionWebUI", "python-package-conda")
# Next.js
plot_commit_timeline(nextJs_yml_commits, "Next.Js")
# NOTE(review): the "## Warning" line below looks like rendered knitr output
# of the call above rather than source - confirm before removing it.
## Warning: Removed 1 rows containing missing values (`position_stack()`).
plot_ownership_map(nextJs_yml_commits, "Next.Js")
plot_file_evolution(nextJs_yml_commits, "Next.Js", "build_test_deploy")
# Langchain
plot_commit_timeline(langchain_yml_commits, "Langchain")
plot_ownership_map(langchain_yml_commits, "Langchain")
plot_file_evolution(langchain_yml_commits, "Langchain", "codespell")